#Downloading the Data
!pip install pandas
Defaulting to user installation because normal site-packages is not writeable Requirement already satisfied: pandas in c:\programdata\anaconda3\lib\site-packages (1.5.3) Requirement already satisfied: python-dateutil>=2.8.1 in c:\programdata\anaconda3\lib\site-packages (from pandas) (2.8.2) Requirement already satisfied: pytz>=2020.1 in c:\programdata\anaconda3\lib\site-packages (from pandas) (2022.7) Requirement already satisfied: numpy>=1.21.0 in c:\programdata\anaconda3\lib\site-packages (from pandas) (1.24.3) Requirement already satisfied: six>=1.5 in c:\programdata\anaconda3\lib\site-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)
# Raw-file URL of the medical charges CSV hosted on GitHub.
medical_charges_url = 'https://raw.githubusercontent.com/JovianML/opendatasets/master/data/medical-charges.csv'
from urllib.request import urlretrieve
# Fetch the CSV and store it as 'medical.csv' in the current working directory.
urlretrieve(medical_charges_url, 'medical.csv')
('medical.csv', <http.client.HTTPMessage at 0x1c6081f00d0>)
import pandas as pd
medical_df = pd.read_csv('medical.csv')
medical_df
| age | sex | bmi | children | smoker | region | charges | |
|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 |
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1333 | 50 | male | 30.970 | 3 | no | northwest | 10600.54830 |
| 1334 | 18 | female | 31.920 | 0 | no | northeast | 2205.98080 |
| 1335 | 18 | female | 36.850 | 0 | no | southeast | 1629.83350 |
| 1336 | 21 | female | 25.800 | 0 | no | southwest | 2007.94500 |
| 1337 | 61 | female | 29.070 | 0 | yes | northwest | 29141.36030 |
1338 rows × 7 columns
medical_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1338 entries, 0 to 1337 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 1338 non-null int64 1 sex 1338 non-null object 2 bmi 1338 non-null float64 3 children 1338 non-null int64 4 smoker 1338 non-null object 5 region 1338 non-null object 6 charges 1338 non-null float64 dtypes: float64(2), int64(2), object(3) memory usage: 73.3+ KB
medical_df.describe()
| age | bmi | children | charges | |
|---|---|---|---|---|
| count | 1338.000000 | 1338.000000 | 1338.000000 | 1338.000000 |
| mean | 39.207025 | 30.663397 | 1.094918 | 13270.422265 |
| std | 14.049960 | 6.098187 | 1.205493 | 12110.011237 |
| min | 18.000000 | 15.960000 | 0.000000 | 1121.873900 |
| 25% | 27.000000 | 26.296250 | 0.000000 | 4740.287150 |
| 50% | 39.000000 | 30.400000 | 1.000000 | 9382.033000 |
| 75% | 51.000000 | 34.693750 | 2.000000 | 16639.912515 |
| max | 64.000000 | 53.130000 | 5.000000 | 63770.428010 |
#Exploratory Analysis and Visualization
!pip install plotly
!pip install matplotlib
!pip install seaborn
Defaulting to user installation because normal site-packages is not writeable Requirement already satisfied: plotly in c:\programdata\anaconda3\lib\site-packages (5.9.0) Requirement already satisfied: tenacity>=6.2.0 in c:\programdata\anaconda3\lib\site-packages (from plotly) (8.2.2) Defaulting to user installation because normal site-packages is not writeable Requirement already satisfied: matplotlib in c:\programdata\anaconda3\lib\site-packages (3.7.1) Requirement already satisfied: contourpy>=1.0.1 in c:\programdata\anaconda3\lib\site-packages (from matplotlib) (1.0.5) Requirement already satisfied: cycler>=0.10 in c:\programdata\anaconda3\lib\site-packages (from matplotlib) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\programdata\anaconda3\lib\site-packages (from matplotlib) (4.25.0) Requirement already satisfied: kiwisolver>=1.0.1 in c:\programdata\anaconda3\lib\site-packages (from matplotlib) (1.4.4) Requirement already satisfied: numpy>=1.20 in c:\programdata\anaconda3\lib\site-packages (from matplotlib) (1.24.3) Requirement already satisfied: packaging>=20.0 in c:\programdata\anaconda3\lib\site-packages (from matplotlib) (23.0) Requirement already satisfied: pillow>=6.2.0 in c:\programdata\anaconda3\lib\site-packages (from matplotlib) (9.4.0) Requirement already satisfied: pyparsing>=2.3.1 in c:\programdata\anaconda3\lib\site-packages (from matplotlib) (3.0.9) Requirement already satisfied: python-dateutil>=2.7 in c:\programdata\anaconda3\lib\site-packages (from matplotlib) (2.8.2) Requirement already satisfied: six>=1.5 in c:\programdata\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0) Defaulting to user installation because normal site-packages is not writeable Requirement already satisfied: seaborn in c:\programdata\anaconda3\lib\site-packages (0.12.2) Requirement already satisfied: numpy!=1.24.0,>=1.17 in c:\programdata\anaconda3\lib\site-packages (from seaborn) (1.24.3) Requirement already satisfied: 
pandas>=0.25 in c:\programdata\anaconda3\lib\site-packages (from seaborn) (1.5.3) Requirement already satisfied: matplotlib!=3.6.1,>=3.1 in c:\programdata\anaconda3\lib\site-packages (from seaborn) (3.7.1) Requirement already satisfied: contourpy>=1.0.1 in c:\programdata\anaconda3\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (1.0.5) Requirement already satisfied: cycler>=0.10 in c:\programdata\anaconda3\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\programdata\anaconda3\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (4.25.0) Requirement already satisfied: kiwisolver>=1.0.1 in c:\programdata\anaconda3\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (1.4.4) Requirement already satisfied: packaging>=20.0 in c:\programdata\anaconda3\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (23.0) Requirement already satisfied: pillow>=6.2.0 in c:\programdata\anaconda3\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (9.4.0) Requirement already satisfied: pyparsing>=2.3.1 in c:\programdata\anaconda3\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (3.0.9) Requirement already satisfied: python-dateutil>=2.7 in c:\programdata\anaconda3\lib\site-packages (from matplotlib!=3.6.1,>=3.1->seaborn) (2.8.2) Requirement already satisfied: pytz>=2020.1 in c:\programdata\anaconda3\lib\site-packages (from pandas>=0.25->seaborn) (2022.7) Requirement already satisfied: six>=1.5 in c:\programdata\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.1->seaborn) (1.16.0)
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Global plot appearance: seaborn grid theme first, then matplotlib defaults
# (font size, figure size, fully transparent figure background).
sns.set_style('darkgrid')
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.facecolor': '#00000000',
})
#Age
#Age is a numeric column. The minimum age in the dataset is 18 and the maximum age is 64. Thus, we can visualize the distribution of age using a histogram with 47 bins (one for each year) and a box plot.
medical_df.age.describe()
count 1338.000000 mean 39.207025 std 14.049960 min 18.000000 25% 27.000000 50% 39.000000 75% 51.000000 max 64.000000 Name: age, dtype: float64
# Age histogram with a marginal box plot; 47 bins are requested so that each
# year of age from 18 to 64 can get its own bar.
fig = px.histogram(
    medical_df,
    x='age',
    marginal='box',
    nbins=47,
    title='Distribution of Age',
).update_layout(bargap=0.1)
fig.show()
#Body Mass Index
#Let's look at the distribution of BMI (Body Mass Index) of customers, using a histogram and box plot.
# BMI histogram (drawn in red) with a marginal box plot.
fig = px.histogram(
    medical_df,
    x='bmi',
    marginal='box',
    color_discrete_sequence=['red'],
    title='Distribution of BMI (Body Mass Index)',
).update_layout(bargap=0.1)
fig.show()
#Charges
#Let's visualize the distribution of "charges" i.e. the annual medical charges for customers. This is the column we're trying to predict. Let's also use the categorical column "smoker" to distinguish the charges for smokers and non-smokers.
# Histogram of annual charges, split by the 'smoker' column, with a marginal
# box plot for each group.
fig = px.histogram(
    medical_df,
    x='charges',
    marginal='box',
    color='smoker',
    color_discrete_sequence=['green', 'grey'],
    title='Annual Medical Charges',
).update_layout(bargap=0.1)
fig.show()
#Smoker
#Let's visualize the distribution of the "smoker" column (containing values "yes" and "no") using a histogram.
medical_df.smoker.value_counts()
no 1064 yes 274 Name: smoker, dtype: int64
px.histogram(medical_df, x='smoker', color='sex', title='Smoker')
#Age and Charges
#Let's visualize the relationship between "age" and "charges" using a scatter plot. Each point in the scatter plot represents one customer. We'll also use values in the "smoker" column to color the points.
# One point per customer: age against charges, coloured by smoking status.
# 'sex' is added to the hover tooltip.
fig = px.scatter(
    medical_df,
    x='age',
    y='charges',
    color='smoker',
    opacity=0.8,
    hover_data=['sex'],
    title='Age vs. Charges',
).update_traces(marker_size=5)
fig.show()
#We can make the following observations from the above chart:
#The general trend seems to be that medical charges increase with age, as we might expect. However, there is significant variation at every age, and it's clear that age alone cannot be used to accurately determine medical charges.
#We can see three "clusters" of points, each of which seems to form a line with an increasing slope:
# 1. The first and largest cluster consists primarily of presumably "healthy non-smokers" who have relatively low medical charges compared to others.
# 2. The second cluster contains a mix of smokers and non-smokers. It's possible that these are actually two distinct but overlapping clusters: "non-smokers with medical issues" and "smokers without major medical issues".
# 3. The final cluster consists exclusively of smokers, presumably smokers with major medical issues that are possibly related to or worsened by smoking.
# One point per customer: BMI against charges, coloured by smoking status.
# 'sex' is added to the hover tooltip.
fig = px.scatter(
    medical_df,
    x='bmi',
    y='charges',
    color='smoker',
    opacity=0.8,
    hover_data=['sex'],
    title='BMI vs. Charges',
).update_traces(marker_size=5)
fig.show()
#Correlation
medical_df.charges.corr(medical_df.age)
0.2990081933306476
medical_df.charges.corr(medical_df.bmi)
0.19834096883362884
# Map the yes/no smoker flag to 1/0 so it can be correlated with charges.
smoker_values = dict(no=0, yes=1)
smoker_numeric = medical_df['smoker'].map(smoker_values)
medical_df['charges'].corr(smoker_numeric)
0.7872514304984767
# Pairwise correlations of the numeric columns only. Being explicit avoids the
# FutureWarning about the changing `numeric_only` default and keeps the call
# working when the frame also holds text columns (sex, smoker, region).
medical_df.corr(numeric_only=True)
C:\Users\yashg\AppData\Local\Temp\ipykernel_21820\4290363667.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
| age | bmi | children | charges | |
|---|---|---|---|---|
| age | 1.000000 | 0.109272 | 0.042469 | 0.299008 |
| bmi | 0.109272 | 1.000000 | 0.012759 | 0.198341 |
| children | 0.042469 | 0.012759 | 1.000000 | 0.067998 |
| charges | 0.299008 | 0.198341 | 0.067998 | 1.000000 |
# Annotated heatmap of the numeric-column correlations. `numeric_only=True`
# is passed explicitly so the text columns are excluded without relying on
# the deprecated default.
sns.heatmap(medical_df.corr(numeric_only=True), cmap='Reds', annot=True)
plt.title('Correlation Matrix');
C:\Users\yashg\AppData\Local\Temp\ipykernel_21820\2153061389.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
#Linear Regression using a Single Feature
# Keep only the rows whose 'smoker' value is 'no', then plot age vs. charges.
non_smoker_df = medical_df.loc[medical_df['smoker'] == 'no']
plt.title('Age vs. Charges')
sns.scatterplot(
    data=non_smoker_df,
    x='age',
    y='charges',
    alpha=0.7,
    s=15,
);
#Model
#linear regression
#charges=w×age+b
def estimate_charges(age, w, b):
    """Return the linear estimate ``w * age + b``.

    Args:
        age: A single age or any object supporting ``*`` and ``+`` with
            numbers (the notebook passes a pandas Series of ages).
        w: Slope (weight) applied to ``age``.
        b: Intercept (bias) added to the product.

    Returns:
        The estimated charges, of the same kind as ``age``.
    """
    return w * age + b
# First hand-picked guess for the slope and intercept.
w, b = 50, 100
ages = non_smoker_df['age']
estimated_charges = estimate_charges(ages, w, b)
# Draw the estimates as red circles joined by a line.
plt.plot(ages, estimated_charges, 'r-o');
plt.xlabel('Age');
plt.ylabel('Estimated Charges');
# Overlay the estimated line (red) on the actual non-smoker charges.
target = non_smoker_df['charges']
plt.plot(ages, estimated_charges, 'r', alpha=0.9);
plt.scatter(ages, target, s=8, alpha=0.8);
plt.xlabel('Age');
plt.ylabel('Charges')
# Legend entries follow the drawing order: line first, then the points.
plt.legend(['Estimate', 'Actual']);
def try_parameters(w, b):
    """Plot the line ``w * age + b`` against the actual non-smoker charges.

    Reads ages and charges from the module-level ``non_smoker_df`` and draws
    on the current matplotlib figure.

    Args:
        w: Slope passed to ``estimate_charges``.
        b: Intercept passed to ``estimate_charges``.
    """
    ages = non_smoker_df.age
    target = non_smoker_df.charges
    estimated_charges = estimate_charges(ages, w, b)

    # Line first, points second, so the legend labels match the draw order.
    plt.plot(ages, estimated_charges, 'r', alpha=0.9)
    plt.scatter(ages, target, s=8, alpha=0.8)
    plt.xlabel('Age')
    plt.ylabel('Charges')
    plt.legend(['Estimate', 'Actual'])
try_parameters(60, 200)
try_parameters(400, 5000)
#Loss/Cost Function
#Let's define a function to compute the RMSE.
!pip install numpy
import numpy as np
Defaulting to user installation because normal site-packages is not writeable Requirement already satisfied: numpy in c:\programdata\anaconda3\lib\site-packages (1.24.3)
def rmse(targets, predictions):
    """Return the root mean squared error between targets and predictions.

    Args:
        targets: Actual values (NumPy array or pandas Series).
        predictions: Predicted values, subtractable element-wise from
            ``targets``.

    Returns:
        The square root of the mean of the squared differences.
    """
    return np.sqrt(np.mean(np.square(targets - predictions)))
# Re-plot the hand-picked guess and measure its error on the non-smoker data.
w, b = 50, 100
try_parameters(w, b)
targets = non_smoker_df.charges
predicted = estimate_charges(non_smoker_df['age'], w, b)
rmse(targets, predicted)
8461.949562575493
def try_parameters(w, b):
    """Plot the line ``w * age + b`` against actual charges and print its RMSE.

    Reads ages and charges from the module-level ``non_smoker_df``, draws on
    the current matplotlib figure and prints the loss computed by ``rmse``.

    Args:
        w: Slope passed to ``estimate_charges``.
        b: Intercept passed to ``estimate_charges``.
    """
    ages = non_smoker_df.age
    target = non_smoker_df.charges
    predictions = estimate_charges(ages, w, b)

    # Line first, points second, so the legend labels match the draw order.
    plt.plot(ages, predictions, 'r', alpha=0.9)
    plt.scatter(ages, target, s=8, alpha=0.8)
    plt.xlabel('Age')
    plt.ylabel('Charges')
    plt.legend(['Prediction', 'Actual'])

    loss = rmse(target, predictions)
    print("RMSE Loss: ", loss)
try_parameters(50, 100)
RMSE Loss: 8461.949562575493
#Linear Regression using Scikit-learn
!pip install scikit-learn
from sklearn.linear_model import LinearRegression
model = LinearRegression()
Defaulting to user installation because normal site-packages is not writeable Requirement already satisfied: scikit-learn in c:\users\yashg\appdata\roaming\python\python311\site-packages (1.4.1.post1) Requirement already satisfied: numpy<2.0,>=1.19.5 in c:\programdata\anaconda3\lib\site-packages (from scikit-learn) (1.24.3) Requirement already satisfied: scipy>=1.6.0 in c:\programdata\anaconda3\lib\site-packages (from scikit-learn) (1.10.1) Requirement already satisfied: joblib>=1.2.0 in c:\users\yashg\appdata\roaming\python\python311\site-packages (from scikit-learn) (1.3.2) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\programdata\anaconda3\lib\site-packages (from scikit-learn) (2.2.0)
# Double brackets keep `inputs` two-dimensional (one column); `targets` is a
# one-dimensional Series of charges.
inputs = non_smoker_df[['age']]
targets = non_smoker_df.charges
print('inputs.shape :', inputs.shape)
print('targets.shape :', targets.shape)
inputs.shape : (1064, 1) targes.shape : (1064,)
model.fit(inputs, targets)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearRegression()
# The model was fitted on a DataFrame with an 'age' column, so predict with a
# frame carrying the same column name; a bare ndarray triggers scikit-learn's
# "X does not have valid feature names" warning.
model.predict(pd.DataFrame({'age': [23, 37, 61]}))
C:\Users\yashg\AppData\Roaming\Python\Python311\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
array([ 4055.30443855, 7796.78921819, 14210.76312614])
predictions = model.predict(inputs)
predictions
array([2719.0598744 , 5391.54900271, 6727.79356686, ..., 2719.0598744 ,
2719.0598744 , 3520.80661289])
rmse(targets, predictions)
4662.505766636395
# w
model.coef_
array([267.24891283])
# b
model.intercept_
-2091.4205565650827
try_parameters(model.coef_, model.intercept_)
RMSE Loss: 4662.505766636395
# Feature: age only; target: charges (non-smokers)
inputs = non_smoker_df[['age']]
targets = non_smoker_df['charges']
# Build the model, then fit it on the data
model = LinearRegression()
model.fit(inputs, targets)
# Predict on the same rows used for training
predictions = model.predict(inputs)
# Compute the loss to evaluate the model
loss = rmse(targets, predictions)
print('Loss:', loss)
Loss: 4662.505766636395
#Linear Regression using Multiple Features
#So far, we've used only the "age" feature to estimate "charges". Adding another feature like "bmi" is fairly straightforward. We simply assume the following relationship:
#charges = w_1 \times age + w_2 \times bmi + b
# Features: age and bmi; target: charges (non-smokers)
inputs = non_smoker_df[['age', 'bmi']]
targets = non_smoker_df['charges']
# Build the model, then fit it on the data
model = LinearRegression()
model.fit(inputs, targets)
# Predict on the same rows used for training
predictions = model.predict(inputs)
# Compute the loss to evaluate the model
loss = rmse(targets, predictions)
print('Loss:', loss)
Loss: 4662.3128354612945
non_smoker_df.charges.corr(non_smoker_df.bmi)
0.08403654312833271
# 2-D view: BMI against charges for non-smokers.
fig = px.scatter(
    non_smoker_df,
    x='bmi',
    y='charges',
    title='BMI vs. Charges',
).update_traces(marker_size=5)
fig.show()
# 3-D view: age and BMI against charges for non-smokers.
fig = px.scatter_3d(
    non_smoker_df,
    x='age',
    y='bmi',
    z='charges',
).update_traces(marker_size=3, marker_opacity=0.5)
fig.show()
model.coef_, model.intercept_
(array([266.87657817, 7.07547666]), -2293.6320906488654)
non_smoker_df.charges.corr(non_smoker_df.children)
0.13892870453542197
# Strip plot of charges for each number of children (non-smokers).
fig = px.strip(
    non_smoker_df,
    x='children',
    y='charges',
    title="Children vs. Charges",
).update_traces(marker_size=4, marker_opacity=0.7)
fig.show()
# Features: age, bmi and children; target: charges (non-smokers)
inputs = non_smoker_df[['age', 'bmi', 'children']]
targets = non_smoker_df['charges']
# Build the model, then fit it on the data
model = LinearRegression()
model.fit(inputs, targets)
# Predict on the same rows used for training
predictions = model.predict(inputs)
# Compute the loss to evaluate the model
loss = rmse(targets, predictions)
print('Loss:', loss)
Loss: 4608.470405038247
# Same three features, but over every customer (smokers included)
inputs = medical_df[['age', 'bmi', 'children']]
targets = medical_df['charges']
# Build the model, then fit it on the data
model = LinearRegression()
model.fit(inputs, targets)
# Predict on the same rows used for training
predictions = model.predict(inputs)
# Compute the loss to evaluate the model
loss = rmse(targets, predictions)
print('Loss:', loss)
Loss: 11355.317901125973
#Using Categorical Features for Machine Learning
#Binary Categories
#The "smoker" category has just two values "yes" and "no". Let's create a new column "smoker_code" containing 0 for "no" and 1 for "yes".
sns.barplot(data=medical_df, x='smoker', y='charges');
# Store the smoker flag as 0 ("no") / 1 ("yes") in a new 'smoker_code' column.
smoker_codes = dict(no=0, yes=1)
medical_df['smoker_code'] = medical_df['smoker'].map(smoker_codes)
medical_df['charges'].corr(medical_df['smoker_code'])
0.7872514304984767
medical_df
| age | sex | bmi | children | smoker | region | charges | smoker_code | |
|---|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 | 1 |
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 | 0 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 | 0 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 | 0 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1333 | 50 | male | 30.970 | 3 | no | northwest | 10600.54830 | 0 |
| 1334 | 18 | female | 31.920 | 0 | no | northeast | 2205.98080 | 0 |
| 1335 | 18 | female | 36.850 | 0 | no | southeast | 1629.83350 | 0 |
| 1336 | 21 | female | 25.800 | 0 | no | southwest | 2007.94500 | 0 |
| 1337 | 61 | female | 29.070 | 0 | yes | northwest | 29141.36030 | 1 |
1338 rows × 8 columns
# Features: the numeric columns plus the encoded smoker flag
inputs = medical_df[['age', 'bmi', 'children', 'smoker_code']]
targets = medical_df['charges']
# Build the model, then fit it on the data
model = LinearRegression()
model.fit(inputs, targets)
# Predict on the same rows used for training
predictions = model.predict(inputs)
# Compute the loss to evaluate the model
loss = rmse(targets, predictions)
print('Loss:', loss)
Loss: 6056.439217188081
sns.barplot(data=medical_df, x='sex', y='charges')
<Axes: xlabel='sex', ylabel='charges'>
# Store sex as 0 ("female") / 1 ("male") in a new 'sex_code' column.
sex_codes = dict(female=0, male=1)
medical_df['sex_code'] = medical_df['sex'].map(sex_codes)
medical_df['charges'].corr(medical_df['sex_code'])
0.057292062202025366
# Features: numeric columns plus the encoded smoker and sex flags
inputs = medical_df[['age', 'bmi', 'children', 'smoker_code', 'sex_code']]
targets = medical_df['charges']
# Build the model, then fit it on the data
model = LinearRegression()
model.fit(inputs, targets)
# Predict on the same rows used for training
predictions = model.predict(inputs)
# Compute the loss to evaluate the model
loss = rmse(targets, predictions)
print('Loss:', loss)
Loss: 6056.100708754546
#One-hot Encoding
sns.barplot(data=medical_df, x='region', y='charges');
from sklearn import preprocessing
# Fit a one-hot encoder on the 'region' column, then show the categories it found.
enc = preprocessing.OneHotEncoder().fit(medical_df[['region']])
enc.categories_
[array(['northeast', 'northwest', 'southeast', 'southwest'], dtype=object)]
one_hot = enc.transform(medical_df[['region']]).toarray()
one_hot
array([[0., 0., 0., 1.],
[0., 0., 1., 0.],
[0., 0., 1., 0.],
...,
[0., 0., 1., 0.],
[0., 0., 0., 1.],
[0., 1., 0., 0.]])
# Take the new column names from the fitted encoder rather than hard-coding
# them, so each column of `one_hot` is always labelled with the category it
# encodes (same order as enc.categories_).
medical_df[list(enc.categories_[0])] = one_hot
medical_df
| age | sex | bmi | children | smoker | region | charges | smoker_code | sex_code | northeast | northwest | southeast | southwest | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 | 1 | 0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 | 0 | 1 | 0.0 | 0.0 | 1.0 | 0.0 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 | 0 | 1 | 0.0 | 0.0 | 1.0 | 0.0 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 | 0 | 1 | 0.0 | 1.0 | 0.0 | 0.0 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 | 0 | 1 | 0.0 | 1.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1333 | 50 | male | 30.970 | 3 | no | northwest | 10600.54830 | 0 | 1 | 0.0 | 1.0 | 0.0 | 0.0 |
| 1334 | 18 | female | 31.920 | 0 | no | northeast | 2205.98080 | 0 | 0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 1335 | 18 | female | 36.850 | 0 | no | southeast | 1629.83350 | 0 | 0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 1336 | 21 | female | 25.800 | 0 | no | southwest | 2007.94500 | 0 | 0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1337 | 61 | female | 29.070 | 0 | yes | northwest | 29141.36030 | 1 | 0 | 0.0 | 1.0 | 0.0 | 0.0 |
1338 rows × 13 columns
# Features: numeric columns, binary codes and the one-hot region columns
input_cols = [
    'age', 'bmi', 'children',
    'smoker_code', 'sex_code',
    'northeast', 'northwest', 'southeast', 'southwest',
]
inputs = medical_df[input_cols]
targets = medical_df['charges']
# Build the model, then fit it on the data
model = LinearRegression()
model.fit(inputs, targets)
# Predict on the same rows used for training
predictions = model.predict(inputs)
# Compute the loss to evaluate the model
loss = rmse(targets, predictions)
print('Loss:', loss)
Loss: 6041.6796511744515
#Model Improvements
#Feature Scaling
model.coef_
array([ 256.85635254, 339.19345361, 475.50054515, 23848.53454191,
-131.3143594 , 587.00923503, 234.0453356 , -448.01281436,
-373.04175627])
model.intercept_
-12525.547811195444
# One row per feature with its learned weight; the final row (labelled 1)
# holds the intercept.
weights_df = pd.DataFrame(dict(
    feature=np.append(input_cols, 1),
    weight=np.append(model.coef_, model.intercept_),
))
weights_df
| feature | weight | |
|---|---|---|
| 0 | age | 256.856353 |
| 1 | bmi | 339.193454 |
| 2 | children | 475.500545 |
| 3 | smoker_code | 23848.534542 |
| 4 | sex_code | -131.314359 |
| 5 | northeast | 587.009235 |
| 6 | northwest | 234.045336 |
| 7 | southeast | -448.012814 |
| 8 | southwest | -373.041756 |
| 9 | 1 | -12525.547811 |
medical_df
| age | sex | bmi | children | smoker | region | charges | smoker_code | sex_code | northeast | northwest | southeast | southwest | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 | 1 | 0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 | 0 | 1 | 0.0 | 0.0 | 1.0 | 0.0 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 | 0 | 1 | 0.0 | 0.0 | 1.0 | 0.0 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 | 0 | 1 | 0.0 | 1.0 | 0.0 | 0.0 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 | 0 | 1 | 0.0 | 1.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1333 | 50 | male | 30.970 | 3 | no | northwest | 10600.54830 | 0 | 1 | 0.0 | 1.0 | 0.0 | 0.0 |
| 1334 | 18 | female | 31.920 | 0 | no | northeast | 2205.98080 | 0 | 0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 1335 | 18 | female | 36.850 | 0 | no | southeast | 1629.83350 | 0 | 0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 1336 | 21 | female | 25.800 | 0 | no | southwest | 2007.94500 | 0 | 0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1337 | 61 | female | 29.070 | 0 | yes | northwest | 29141.36030 | 1 | 0 | 0.0 | 1.0 | 0.0 | 0.0 |
1338 rows × 13 columns
from sklearn.preprocessing import StandardScaler
# Columns to standardize; the 0/1 coded and one-hot columns are not included.
numeric_cols = ['age', 'bmi', 'children']
scaler = StandardScaler()
# Fit on every row of medical_df; the learned statistics are then available
# as scaler.mean_ and scaler.var_.
scaler.fit(medical_df[numeric_cols])
StandardScaler()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
StandardScaler()
scaler.mean_
array([39.20702541, 30.66339686, 1.09491779])
scaler.var_
array([197.25385199, 37.16008997, 1.45212664])
scaled_inputs = scaler.transform(medical_df[numeric_cols])
scaled_inputs
array([[-1.43876426, -0.45332 , -0.90861367],
[-1.50996545, 0.5096211 , -0.07876719],
[-0.79795355, 0.38330685, 1.58092576],
...,
[-1.50996545, 1.0148781 , -0.90861367],
[-1.29636188, -0.79781341, -0.90861367],
[ 1.55168573, -0.26138796, -0.90861367]])
# 'northeast' is left out and serves as the baseline region. With an intercept
# in the model, the four one-hot region columns always sum to 1 and are
# perfectly collinear, which makes the fit ill-conditioned and yields
# meaningless, astronomically large region weights.
cat_cols = ['smoker_code', 'sex_code', 'northwest', 'southeast', 'southwest']
categorical_data = medical_df[cat_cols].values
# Scaled numeric columns first, then the categorical codes, matching the
# order numeric_cols + cat_cols.
inputs = np.concatenate((scaled_inputs, categorical_data), axis=1)
targets = medical_df.charges
# Create and train the model
model = LinearRegression().fit(inputs, targets)
# Generate predictions
predictions = model.predict(inputs)
# Compute loss to evaluate the model
loss = rmse(targets, predictions)
print('Loss:', loss)
Loss: 6042.751556200273
# One row per feature (scaled numeric columns, then categorical columns) with
# its learned weight; the final row (labelled 1) holds the intercept.
weights_df = pd.DataFrame(dict(
    feature=np.append(numeric_cols + cat_cols, 1),
    weight=np.append(model.coef_, model.intercept_),
))
weights_df
| feature | weight | |
|---|---|---|
| 0 | age | 3.608872e+03 |
| 1 | bmi | 2.058490e+03 |
| 2 | children | 5.615551e+02 |
| 3 | smoker_code | 2.385276e+04 |
| 4 | sex_code | -1.670010e+02 |
| 5 | northeast | 1.758467e+17 |
| 6 | northwest | 1.758467e+17 |
| 7 | southeast | 1.758467e+17 |
| 8 | southwest | 1.758467e+17 |
| 9 | 1 | -1.758467e+17 |
### Creating a Test Set
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows for testing. A fixed random_state makes the split,
# and therefore the reported losses, reproducible from run to run.
# NOTE(review): `inputs` was scaled with statistics from the full dataset
# before this split — confirm whether the scaler should be fit on the
# training rows only.
inputs_train, inputs_test, targets_train, targets_test = train_test_split(
    inputs, targets, test_size=0.1, random_state=42)
# Create and train the model
model = LinearRegression().fit(inputs_train, targets_train)
# Generate predictions
predictions_test = model.predict(inputs_test)
# Compute loss to evaluate the model
loss = rmse(targets_test, predictions_test)
print('Test Loss:', loss)
Test Loss: 6392.461462251641
# Generate predictions on the training rows, for comparison with the test loss
predictions_train = model.predict(inputs_train)
# Compute loss to evaluate the model
loss = rmse(targets_train, predictions_train)
print('Training Loss:', loss)
Training Loss: 6002.642998045587